library(knitr)
knitr::opts_chunk$set(cache = TRUE, warning = FALSE,
message = FALSE, cache.lazy = FALSE, cache.path = "../../gitignore/RmdCache/", # keep heavy data in gitignore cache
fig.path = "Rfigures/Rmd/")
machine="mythinkpad" # define the machine we work on
loadALL = FALSE # only load CpG shared by half fish per trt group
loadannot = TRUE # load genome annotations
sourceDMS = TRUE # Load the calculated DMS
sourceSubUnite = FALSE
source("R02.3_DATALOAD.R")
## Features Annotation (use package genomation v1.24.0)
## NB Promoters are defined by options at genomation::readTranscriptFeatures function.
## The default option is to take -1000,+1000bp around the TSS and you can change that.
## -> following Heckwolf 2020 and Sagonas 2020, we consider 1500bp upstream and 500 bp downstream
## Parents comparison:
diffAnn_PAR = annotateWithGeneParts(as(DMSlist$DMS_15pc_G1_C_T,"GRanges"),annotBed12)
## Offspring from control parents comparison:
diffAnn_G2_controlG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_CC_CT,"GRanges"),annotBed12)
## Offspring from infected parents comparison:
diffAnn_G2_infectedG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_TC_TT,"GRanges"),annotBed12)
par(mfrow=c(1,3))
par(mar = c(.1,0.1,5,0.1)) # Set the margin on all sides to 2
## Parents comparison:
diffAnn_PAR = annotateWithGeneParts(as(DMSlist$DMS_15pc_G1_C_T,"GRanges"),annotBed12)
diffAnn_PAR
## promoter exon intron intergenic
## 8.58 15.84 34.13 45.72
## promoter exon intron intergenic
## 8.58 13.19 32.51 45.72
## promoter exon intron
## 1.07 0.19 0.44
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 3020 8128 15141 19226 300247
genomation::plotTargetAnnotation(diffAnn_PAR,precedence=TRUE, main="DMS G1", cex.legend = 1, border="white")
## Offspring from control parents comparison:
diffAnn_G2_controlG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_CC_CT,"GRanges"),annotBed12)
diffAnn_G2_controlG1
## promoter exon intron intergenic
## 11.28 21.05 30.16 44.44
## promoter exon intron intergenic
## 11.28 16.21 28.07 44.44
## promoter exon intron
## 0.38 0.07 0.14
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11 2602 6295 14212 15906 261017
genomation::plotTargetAnnotation(diffAnn_G2_controlG1,precedence=TRUE, main="DMS G2-G1c", cex.legend = 1, border="white")
## Offspring from infected parents comparison:
diffAnn_G2_infectedG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_TC_TT,"GRanges"),annotBed12)
diffAnn_G2_infectedG1
## promoter exon intron intergenic
## 12.90 13.62 34.64 46.81
## promoter exon intron intergenic
## 12.90 9.42 30.87 46.81
## promoter exon intron
## 0.28 0.03 0.08
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9 2355 7150 12285 14773 109527
genomation::plotTargetAnnotation(diffAnn_G2_infectedG1,precedence=TRUE, main="DMS G2-G1i", cex.legend = 1, border="white")
par(mfrow=c(1,1))
## Run the function to get DMS info
DMS_info_G1 <- myDMSinfo(DMSlist$DMS_15pc_G1_C_T, uniteCov6_G1_woSexAndUnknowChrOVERLAP)
DMS_info_G2_G1c_final <- myDMSinfo(DMSlist$DMS_15pc_CC_CT, uniteCov14_G2_woSexAndUnknowChrOVERLAP)
DMS_info_G2_G1i_final <- myDMSinfo(DMSlist$DMS_15pc_TC_TT,uniteCov14_G2_woSexAndUnknowChrOVERLAP)
NB Kostas’ results: “We found a total of 1,973 CpG sites out of 1,172,887 CpGs (0.17%) across the genome that showed at least 15% differential fractional methylation (differentially methylated site [DMS]; q < 0.01) between infected and uninfected fish”
Here: we obtain out a total of 1001880 CpG sites: 3648 (0.36%) showed at least 15% differential fractional methylation (differentially methylated site [DMS]; q < 0.01) between infected and uninfected fish in the parental group; 1197 (0.12%) in the offspring from control parents comparison; 690 (0.07%) in the offspring from infected parents comparison.
## Chi2 test: are the number of DMS from G2-G1C and G2-G1I overlapping with DMSpar statistically different?
A=length(intersect(DMS_info_G1$DMS,DMS_info_G2_G1c_final$DMS))
B=length(DMS_info_G2_G1c_final$DMS)
C=length(intersect(DMS_info_G1$DMS,DMS_info_G2_G1i_final$DMS))
D=length(DMS_info_G2_G1i_final$DMS)
Observed=matrix(c(A, B-A, C, D-C),nrow=2)
Observed
## [,1] [,2]
## [1,] 106 59
## [2,] 1091 631
chisq.test(Observed)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Observed
## X-squared = 0.019909, df = 1, p-value = 0.8878
## not statistically different
## output Venn diagrams
allVenn <- ggVennDiagram(list("DMS G1" = DMS_info_G1$DMS, "DMS G2-c" = DMS_info_G2_G1c_final$DMS, "DMS G2-i" = DMS_info_G2_G1i_final$DMS), label_alpha = 0) +
scale_fill_gradient(low="white",high = "red")
hypoVenn <- ggVennDiagram(list("DMS G1\nhypo" = DMS_info_G1$DMS[DMS_info_G1$direction %in% "hypo"],
"DMS G2-c\nhypo" = DMS_info_G2_G1c_final$DMS[DMS_info_G2_G1c_final$direction %in% "hypo"],
"DMS G2-i\nhypo" = DMS_info_G2_G1i_final$DMS[DMS_info_G2_G1i_final$direction %in% "hypo"]), label_alpha = 0) +
scale_fill_gradient(low="white",high = "red")
hyperVenn <- ggVennDiagram(list("DMS G1\nhyper" = DMS_info_G1$DMS[DMS_info_G1$direction %in% "hyper"],
"DMS G2-c\nhyper" = DMS_info_G2_G1c_final$DMS[DMS_info_G2_G1c_final$direction %in% "hyper"],
"DMS G2-i\nhyper" = DMS_info_G2_G1i_final$DMS[DMS_info_G2_G1i_final$direction %in% "hyper"]), label_alpha = 0) +
scale_fill_gradient(low="white",high = "red")
ggarrange(allVenn,
ggarrange(hypoVenn, hyperVenn, ncol = 2, legend = "none"),
nrow = 2, widths = c(.5,1))
runHyperHypoAnnot <- function(){
par(mfrow=c(2,3))
par(mar = c(.1,0.1,5,0.1)) # Set the margin on all sides to 2
####### HYPO
## Parents comparison:
A = annotateWithGeneParts(
as(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hypo",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(A,precedence=TRUE, main="DMS G1\nhypo",
cex.legend = .4, border="white")
## Offspring from control parents comparison:
B = annotateWithGeneParts(
as(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hypo",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(B,precedence=TRUE, main="DMS G2-G1c\nhypo",
cex.legend = .4, border="white")
## Offspring from infected parents comparison:
C = annotateWithGeneParts(
as(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hypo",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(C,precedence=TRUE, main="DMS G2-G1i\nhypo",
cex.legend = .4, border="white")
####### HYPER
## Parents comparison:
D = annotateWithGeneParts(
as(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hyper",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(D,precedence=TRUE, main="DMS G1\nhyper",
cex.legend = .4, border="white")
## Offspring from control parents comparison:
E = annotateWithGeneParts(
as(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hyper",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(E,precedence=TRUE, main="DMS G2-G1c\nhyper",
cex.legend = .4, border="white")
## Offspring from infected parents comparison:
f = annotateWithGeneParts(
as(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hyper",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(f,precedence=TRUE, main="DMS G2-G1i\nhyper",
cex.legend = .4, border="white")
par(mfrow=c(1,1))
return(list(G1hypo=A, G2G1chypo=B, G2G1ihypo=C, G1hyper=D, G2G1chyper=E, G2G1ihyper=f))
}
myannot=runHyperHypoAnnot()
############################################################
## Venn diagram of overlapping features by their annotation:
table(rowSums(as.data.frame(myannot$G1hypo@members))) # NB: some positions are labelled with several features!
##
## 0 1 2 3
## 559 566 49 2
## as in MBE 2021: "giving precedence to the following order promoters, exons,
## introns, and intergenic regions when features overlapped"
myAnnotateDMS <- function(DMS, annot){
## sanity check
if (nrow(DMS) != nrow(annot)){"STOP error in arguments"}
DMS$pos <- paste(DMS$chr, DMS$start, DMS$end)
## NB as in MBE 2021: "giving precedence to the following order promoters, exons,
## introns, and intergenic regions when features overlapped"
DMS$feature <- NA
## 1. promoters
DMS$feature[which(annot$prom == 1)] = "promoter"
## 2. exons
DMS$feature[which(annot$exon == 1 & annot$prom ==0)] = "exon"
## 3. intron
DMS$feature[which(annot$intro == 1 & annot$exon == 0 & annot$prom ==0)] = "intron"
## 4. intergenic regions
DMS$feature[which(annot$intro == 0 & annot$exon == 0 & annot$prom ==0)] = "intergenic"
return(DMS)
}
DMSlist$DMS_15pc_G1_C_T = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T, as.data.frame(diffAnn_PAR@members))
DMSlist$DMS_15pc_G1_C_T_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hypo",],
as.data.frame(myannot$G1hypo@members))
DMSlist$DMS_15pc_G1_C_T_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hyper",],
as.data.frame(myannot$G1hyper@members))
DMSlist$DMS_15pc_CC_CT = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT, as.data.frame(diffAnn_G2_controlG1@members))
DMSlist$DMS_15pc_CC_CT_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hypo",],
as.data.frame(myannot$G2G1chypo@members))
DMSlist$DMS_15pc_CC_CT_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hyper",],
as.data.frame(myannot$G2G1chyper@members))
DMSlist$DMS_15pc_TC_TT = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT, as.data.frame(diffAnn_G2_infectedG1@members))
DMSlist$DMS_15pc_TC_TT_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hypo",],
as.data.frame(myannot$G2G1ihypo@members))
DMSlist$DMS_15pc_TC_TT_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hyper",],
as.data.frame(myannot$G2G1ihyper@members))
## Make Venn diagram for each feature
getFeatureDFHYPO <- function(myfeat){
a = DMSlist$DMS_15pc_G1_C_T_HYPO$pos[DMSlist$DMS_15pc_G1_C_T_HYPO$feature %in% myfeat]
b = DMSlist$DMS_15pc_CC_CT_HYPO$pos[DMSlist$DMS_15pc_CC_CT_HYPO$feature %in% myfeat]
c = DMSlist$DMS_15pc_TC_TT_HYPO$pos[DMSlist$DMS_15pc_TC_TT_HYPO$feature %in% myfeat]
return(list(a=a,b=b,c=c))
}
getFeatureDFHYPER <- function(myfeat){
a = DMSlist$DMS_15pc_G1_C_T_HYPER$pos[DMSlist$DMS_15pc_G1_C_T_HYPER$feature %in% myfeat]
b = DMSlist$DMS_15pc_CC_CT_HYPER$pos[DMSlist$DMS_15pc_CC_CT_HYPER$feature %in% myfeat]
c = DMSlist$DMS_15pc_TC_TT_HYPER$pos[DMSlist$DMS_15pc_TC_TT_HYPER$feature %in% myfeat]
return(list(a=a,b=b,c=c))
}
getVenn <- function(feat, direction){
if (direction == "hypo"){
ggVennDiagram(list(A = getFeatureDFHYPO(feat)[["a"]],
B = getFeatureDFHYPO(feat)[["b"]],
C = getFeatureDFHYPO(feat)[["c"]]), label_alpha = 0,
category.names = c(paste0("DMS G1\nhypo\n", feat), paste0("DMS G2-c\nhypo\n", feat), paste0("DMS G2-i\nhypo\n", feat))) +
scale_fill_gradient(low="white",high = "red")
} else if (direction == "hyper"){
ggVennDiagram(list(A = getFeatureDFHYPER(feat)[["a"]],
B = getFeatureDFHYPER(feat)[["b"]],
C = getFeatureDFHYPER(feat)[["c"]]), label_alpha = 0,
category.names = c(paste0("DMS G1\nhyper\n", feat), paste0("DMS G2-c\nhyper\n", feat), paste0("DMS G2-i\nhyper\n", feat))) +
scale_fill_gradient(low="white",high = "red")
}
}
ggarrange(getVenn("promoter", "hypo"), getVenn("exon", "hypo"),
getVenn("intron", "hypo"), getVenn("intergenic", "hypo"),
nrow = 2, ncol = 2)
ggarrange(getVenn("promoter", "hyper"), getVenn("exon", "hyper"),
getVenn("intron", "hyper"), getVenn("intergenic", "hyper"),
nrow = 2, ncol = 2)
## Parents trt-ctrl
# load annotation
annot_PAR <- as.data.frame(diffAnn_PAR@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_G1_C_T, annotFile = annot_PAR, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G1 DMS")
## G2-G1c trt-ctrl
# load annotation
annot_G2_G1c <- as.data.frame(diffAnn_G2_controlG1@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_CC_CT, annotFile = annot_G2_G1c, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1c DMS")
## G2-G1i trt-ctrl
# load annotation
annot_G2_G1i <- as.data.frame(diffAnn_G2_infectedG1@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_TC_TT, annotFile = annot_G2_G1i, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1i DMS")
## Outliers in Manhattan plot: 15% diff + 2SD
outliers_G1_final <- which(abs(DMSlist$DMS_15pc_G1_C_T$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_G1_C_T$meth.diff)))
outliers_annot_G1 <- as.data.frame(diffAnn_PAR@members)[outliers_G1_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_G1_C_T[outliers_G1_final, ],
annotFile = outliers_annot_G1, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G1 DMS")
outliers_G2_G1c_final <- which(abs(DMSlist$DMS_15pc_CC_CT$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_CC_CT$meth.diff)))
outliers_annot_G2_G1c <- as.data.frame(diffAnn_G2_controlG1@members)[outliers_G2_G1c_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_CC_CT[outliers_G2_G1c_final, ],
annotFile = outliers_annot_G2_G1c, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1c DMS")
outliers_G2_G1i_final <- which(abs(DMSlist$DMS_15pc_TC_TT$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_TC_TT$meth.diff)))
outliers_annot_G2_G1i <- as.data.frame(diffAnn_G2_infectedG1@members)[outliers_G2_G1i_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_TC_TT[outliers_G2_G1i_final, ],
annotFile = outliers_annot_G2_G1i, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1i DMS")
TBC To be continued
## All in DMSBPlist
#
# ## Add fam to bp
# names(DMSBPlist) <- paste0(plyr::join(data.frame(BP = names(DMSBPlist)),
# unique(data.frame(BP = fullMetadata_OFFS$brotherPairID,
# fam = fullMetadata_OFFS$Family)))[[2]], "_", names(DMSBPlist))
## Extract DMS (by position)
myPosList = lapply(DMSBPlist, lapply, function(x){paste(x$chr, x$end)})
## Find DMS present in at least 4 BP out of 8 (half):
get2keep = function(Compa){
x <- lapply(myPosList, function(x){unlist(x[[paste0("DMS_15pc_BP_", Compa)]])})
f <- table(unlist((x))) # each DMS present between 1 and 8 times
tokeep <- names(f)[f >= 4]
print(length(tokeep))
## Keep the DMS present in 4 families minimum
DMSBPlist_INTER4 <- lapply(x, function(x){x[x %in% tokeep]})
## Reorder by family:
DMSBPlist_INTER4 <- DMSBPlist_INTER4[names(DMSBPlist_INTER4)[order(names(DMSBPlist_INTER4))]]
return(DMSBPlist_INTER4)
}
## Prepare df for complexUpset
getUpsetDF = function(i){ # for a given comparison
A = get2keep(vecCompa[i])
A2 = lapply(A, function(x){
x = data.frame(x) # vector of DMS as df
names(x) = "DMS" # name each CpG
return(x)
})
## Add BP name
for (i in 1:length(names(A2))){
A2[[i]]["BP"] = names(A2)[i]
}
# make a dataframe
A2 = A2 %>% reduce(full_join, by = "DMS")
# names column with BP id
for (i in 2:ncol(A2)) {names(A2)[i] = unique(A2[!is.na(A2[i]), i])}
# replace by 0 or 1 the DMS absence/presence
A = data.frame(apply(A2[2:9], 2, function(x) ifelse(is.na(x), 0, 1)))
# add DMS
A$DMS = A2$DMS
return(A)
}
## Vector of all 4 comparisons
vecCompa <- c("CC_TC", "CT_TT", "CC_CT", "TC_TT")
## Make upset plots
for (i in 1:4){
df = getUpsetDF(i)
print(ComplexUpset::upset(
df,
names(df)[1:8],
width_ratio=0.1,
sort_intersections_by=c('degree', 'cardinality'),
queries=query_by_degree(
df, names(df)[1:8],
params_by_degree=list(
'1'=list(color='red', fill='red'),
'2'=list(color='purple', fill='purple'),
'3'=list(color='blue', fill='blue'),
'4'=list(color='grey', fill='grey'),
'5'=list(color='red', fill='red'),
'6'=list(color='purple', fill='purple'),
'7'=list(color='blue', fill='blue'),
'8'=list(color='green', fill='green')
),
only_components=c("intersections_matrix", "Intersection size")
)))
}
## [1] 1099
## [1] 1257
## [1] 329
## [1] 341
plotManhattanGenes <- function(i){
print(paste0("The number of DMS in the ", vecCompa[i] ," comparison is:"))
DMSvec = unique(unlist(get2keep(vecCompa[i])))
# Annotate the DMS present in at least 4 BP:
# Change the vector into a methobject:
df <- data.frame(chr=sapply(strsplit(DMSvec, " "), `[`, 1),
start=sapply(strsplit(DMSvec, " "), `[`, 2),
end=sapply(strsplit(DMSvec, " "), `[`, 2))
# get annotation
anot4BP <- getAnnotationFun(makeGRangesFromDataFrame(df))
print(paste0("The number of genes with DMS in the ", vecCompa[i] ," comparison is:"))
print(nrow(anot4BP)) # number of genes
# Reorder chromosome
anot4BP <- anot4BP %>%
mutate(chr = gsub("Gy_chr", "", seqid), chrom_nr = seqid %>% deroman(), chrom_order=factor(chrom_nr) %>%
as.numeric()) %>% arrange(chrom_order) %>%
mutate(geneLengthkb = (end - start)/1000, nCpGperGenekb = nCpG/geneLengthkb)
# Plot
plot = ggplot(anot4BP, aes(x=start, y = nCpGperGenekb)) + geom_point(alpha=.4) +
facet_grid(.~fct_inorder(chr)) +
geom_hline(yintercept = 1)+
theme(axis.text.x=element_blank()) +
xlab(paste0("Genes with DMS present in at least 4 brother pairs\nComparison: ", vecCompa[i])) +
ylab("Number of differentially methylated CpG per gene kb")
## Genes with at least 1 CpG differentially methylated per kb:
topGenes = anot4BP[anot4BP$nCpGperGenekb >= 1,]
return(list(plot = plot, anot4BP = anot4BP, topGenes = topGenes))
}
plotManhattanGenes(1)$plot
## [1] "The number of DMS in the CC_TC comparison is:"
## [1] 1099
## [1] "The number of genes with DMS in the CC_TC comparison is:"
## [1] 432
plotManhattanGenes(2)$plot
## [1] "The number of DMS in the CT_TT comparison is:"
## [1] 1257
## [1] "The number of genes with DMS in the CT_TT comparison is:"
## [1] 547
plotManhattanGenes(3)$plot
## [1] "The number of DMS in the CC_CT comparison is:"
## [1] 329
## [1] "The number of genes with DMS in the CC_CT comparison is:"
## [1] 144
plotManhattanGenes(4)$plot
## [1] "The number of DMS in the TC_TT comparison is:"
## [1] 341
## [1] "The number of genes with DMS in the TC_TT comparison is:"
## [1] 171
# create gene universe
gene_universe <- data.frame(
subsetByOverlaps(GRanges(annotGff3), GRanges(uniteCov14_G2_woSexAndUnknowChrOVERLAP))) %>% # subselect covered CpGs
filter(lengths(Ontology_term)!=0) %>% # rm non existing GO terms
filter(type %in% "gene") %>% # keep all the 7416 genes with GO terms
dplyr::select(c("Name", "Ontology_term")) %>%
mutate(go_linkage_type = "IEA") %>% #NB: IEA but not necessarily true, it's from Interproscan after Maker. Sticklebacks (biomart) have 82701 IEA and 63 ISS.
relocate("Ontology_term","go_linkage_type","Name") %>%
unnest(Ontology_term) %>% # one GO per line (was a list before in this column)
data.frame()
# Create gene set collection
goFrame <- GOFrame(gene_universe, organism="Gasterosteus aculeatus")
goAllFrame <- GOAllFrame(goFrame)
gsc_universe <- GeneSetCollection(goAllFrame, setType = GOCollection())
IMPORTANT NOTE from Mel: why conditional hypergeometric test? The GO ontology is set up as a directed acyclic graph, where a parent term is comprised of all its child terms. If you do a standard hypergeometric, you might e.g., find ‘positive regulation of kinase activity’ to be significant. If you then test ‘positive regulation of catalytic activity’, which is a parent term, then it might be significant as well, but only because of the terms coming from positive regulation of kinase activity.
The conditional hypergeometric takes this into account, and only uses those terms that were not already significant when testing a higher order (parent) term.
## For each comparison:
A=makedfGO(1, gene_universe); B=makedfGO(2, gene_universe); C=makedfGO(3, gene_universe); D=makedfGO(4, gene_universe)
## [1] "The number of DMS in the CC_TC comparison is:"
## [1] 1099
## [1] "The number of genes with DMS in the CC_TC comparison is:"
## [1] 432
## [1] "The number of DMS in the CT_TT comparison is:"
## [1] 1257
## [1] "The number of genes with DMS in the CT_TT comparison is:"
## [1] 547
## [1] "The number of DMS in the CC_CT comparison is:"
## [1] 329
## [1] "The number of genes with DMS in the CC_CT comparison is:"
## [1] 144
## [1] "The number of DMS in the TC_TT comparison is:"
## [1] 341
## [1] "The number of genes with DMS in the TC_TT comparison is:"
## [1] 171
dfGO = rbind(A, B, C, D)
# add type of comparison:
dfGO$group = "Different parental treatment"
dfGO$group[dfGO$Comp %in% c("CC_CT", "TC_TT")] = "Different offpring treatment"
dfGO %>% ggplot(aes(x=Comp, y = factor(GO.name))) +
geom_point(aes(color = p.value.adjusted, size = genePercent)) +
scale_color_gradient(name="adjusted\np-value", low = "red", high = "blue") +
scale_size_continuous(name = "% of genes")+
theme_bw() + ylab("") + xlab("Treatments comparison") +
theme(legend.box.background = element_rect(fill = "#ebebeb", color = "#ebebeb"),
legend.background = element_rect(fill = "#ebebeb", color = "#ebebeb"),
legend.key = element_rect(fill = "#ebebeb", color = "#ebebeb"), legend.position="left") + # grey box for legend
facet_grid(fct_inorder(GO.category)~group, scales="free",space = "free")
getGeneSummary <- function(i){
geneNotes = plotManhattanGenes(i)$topGenes$Note
# extract uniprot symbol from note, then uppercase
genes = sapply(geneNotes, function(x) sub("Similar to ", "", x) %>% str_extract(".*:") %>% str_remove(":") %>% toupper)
# Convert the uniprot gene names to entrez ids
topGenesDF = unlist(mapIds(org.Hs.eg.db, keys = genes, column = "ENTREZID", keytype = "SYMBOL")) %>% data.frame()
names(topGenesDF) = "ENTREZID" ; topGenesDF$GeneSymbol = rownames(topGenesDF) ; rownames(topGenesDF) = NULL
# Retrieve gene summary & descirption
genes = entrez_summary(db="gene", id=topGenesDF$ENTREZID)
SummaDF = lapply(genes, function(x) x[["summary"]]) %>% unlist() %>% data.frame()
names(SummaDF) = "Summary" ; SummaDF$ENTREZID = rownames(SummaDF) ; rownames(SummaDF) = NULL
DescDF = lapply(genes, function(x) x[["description"]]) %>% unlist() %>% data.frame()
names(DescDF) = "description" ; DescDF$ENTREZID = rownames(DescDF) ; rownames(DescDF) = NULL
# Output complete table
topGenesDF = merge(merge(DescDF, topGenesDF, all = T), SummaDF, all = T)
# Add which comparison
topGenesDF$comparison = vecCompa[i]
return(topGenesDF)
}
[1] “The number of DMS in the CC_TC comparison is:” [1] 1099 [1] “The number of genes with DMS in the CC_TC comparison is:” [1] 432
| ENTREZID | description | GeneSymbol | Summary | comparison |
|---|---|---|---|---|
| 10873 | malic enzyme 3 | ME3 | Malic enzyme catalyzes the oxidative decarboxylation of malate to pyruvate using either NAD+ or NADP+ as a cofactor. Mammalian tissues contain 3 distinct isoforms of malic enzyme: a cytosolic NADP(+)-dependent isoform, a mitochondrial NADP(+)-dependent isoform, and a mitochondrial NAD(+)-dependent isoform. This gene encodes a mitochondrial NADP(+)-dependent isoform. Multiple alternatively spliced transcript variants have been found for this gene, but the biological validity of some variants has not been determined. [provided by RefSeq, Jul 2008] | CC_TC |
| 10887 | prokineticin receptor 1 | PROKR1 | This gene encodes a member of the G-protein-coupled receptor family. The encoded protein binds to prokineticins (1 and 2), leading to the activation of MAPK and STAT signaling pathways. Prokineticins are protein ligands involved in angiogenesis and inflammation. The encoded protein is expressed in peripheral tissues such as those comprising the circulatory system, lungs, reproductive system, endocrine system and the gastrointestinal system. The protein may be involved in signaling in human fetal ovary during initiation of primordial follicle formation. Sequence variants in this gene may be associated with recurrent miscarriage. [provided by RefSeq, Aug 2016] | CC_TC |
| 11135 | CDC42 effector protein 1 | CDC42EP1 | CDC42 is a member of the Rho GTPase family that regulates multiple cellular activities, including actin polymerization. The protein encoded by this gene is a CDC42 binding protein that mediates actin cytoskeleton reorganization at the plasma membrane. This protein is secreted and is primarily found in bone marrow. [provided by RefSeq, Jul 2008] | CC_TC |
| 221421 | radial spoke head component 9 | RSPH9 | This gene encodes a protein thought to be a component of the radial spoke head in motile cilia and flagella. Mutations in this gene are associated with primary ciliary dyskinesia 12. Alternative splicing results in multiple transcript variants.[provided by RefSeq, Jul 2010] | CC_TC |
| 2287 | FKBP prolyl isomerase 3 | FKBP3 | The protein encoded by this gene is a member of the immunophilin protein family, which play a role in immunoregulation and basic cellular processes involving protein folding and trafficking. This encoded protein is a cis-trans prolyl isomerase that binds the immunosuppressants FK506 and rapamycin, as well as histone deacetylases, the transcription factor YY1, casein kinase II, and nucleolin. It has a higher affinity for rapamycin than for FK506 and thus may be an important target molecule for immunosuppression by rapamycin. [provided by RefSeq, Sep 2008] | CC_TC |
| 2831 | neuropeptides B and W receptor 1 | NPBWR1 | CC_TC | |
| 3040 | hemoglobin subunit alpha 2 | HBA2 | The human alpha globin gene cluster located on chromosome 16 spans about 30 kb and includes seven loci: 5’- zeta - pseudozeta - mu - pseudoalpha-1 - alpha-2 - alpha-1 - theta - 3’. The alpha-2 (HBA2) and alpha-1 (HBA1) coding sequences are identical. These genes differ slightly over the 5’ untranslated regions and the introns, but they differ significantly over the 3’ untranslated regions. Two alpha chains plus two beta chains constitute HbA, which in normal adult life comprises about 97% of the total hemoglobin; alpha chains combine with delta chains to constitute HbA-2, which with HbF (fetal hemoglobin) makes up the remaining 3% of adult hemoglobin. Alpha thalassemias result from deletions of each of the alpha genes as well as deletions of both HBA2 and HBA1; some nondeletion alpha thalassemias have also been reported. [provided by RefSeq, Jul 2008] | CC_TC |
| 3763 | potassium inwardly rectifying channel subfamily J member 6 | KCNJ6 | This gene encodes a member of the G protein-coupled inwardly-rectifying potassium channel family of inward rectifier potassium channels. This type of potassium channel allows a greater flow of potassium into the cell than out of it. These proteins modulate many physiological processes, including heart rate in cardiac cells and circuit activity in neuronal cells, through G-protein coupled receptor stimulation. Mutations in this gene are associated with Keppen-Lubinsky Syndrome, a rare condition characterized by severe developmental delay, facial dysmorphism, and intellectual disability. [provided by RefSeq, Apr 2015] | CC_TC |
| 3982 | lens intrinsic membrane protein 2 | LIM2 | This gene encodes an eye lens-specific protein found at the junctions of lens fiber cells, where it may contribute to cell junctional organization. It acts as a receptor for calmodulin, and may play an important role in both lens development and cataractogenesis. Mutations in this gene have been associated with cataract formation. Alternatively spliced transcript variants encoding different isoforms have been found for this gene.[provided by RefSeq, Sep 2009] | CC_TC |
| 407738 | TAFA chemokine like family member 1 | TAFA1 | This gene is a member of the TAFA family which is composed of five highly homologous genes that encode small secreted proteins. These proteins contain conserved cysteine residues at fixed positions, and are distantly related to MIP-1alpha, a member of the CC-chemokine family. The TAFA proteins are predominantly expressed in specific regions of the brain, and are postulated to function as brain-specific chemokines or neurokines that act as regulators of immune and nervous cells. [provided by RefSeq, Jul 2008] | CC_TC |
| 4099 | myelin associated glycoprotein | MAG | The protein encoded by this gene is a type I membrane protein and member of the immunoglobulin superfamily. It is thought to be involved in the process of myelination. It is a lectin that binds to sialylated glycoconjugates and mediates certain myelin-neuron cell-cell interactions. Three alternatively spliced transcripts encoding different isoforms have been described for this gene. [provided by RefSeq, Nov 2010] | CC_TC |
| 440738 | microtubule associated protein 1 light chain 3 gamma | MAP1LC3C | Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] | CC_TC |
| 51305 | potassium two pore domain channel subfamily K member 9 | KCNK9 | This gene encodes a protein that contains multiple transmembrane regions and two pore-forming P domains and functions as a pH-dependent potassium channel. Amplification and overexpression of this gene have been observed in several types of human carcinomas. This gene is imprinted in the brain, with preferential expression from the maternal allele. A mutation in this gene was associated with Birk-Barel dysmorphism syndrome. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jul 2017] | CC_TC |
| 51647 | cytosolic iron-sulfur assembly component 2B | CIAO2B | CC_TC | |
| 55603 | terminal nucleotidyltransferase 5A | TENT5A | CC_TC | |
| 64747 | major facilitator superfamily domain containing 1 | MFSD1 | CC_TC | |
| 650 | bone morphogenetic protein 2 | BMP2 | This gene encodes a secreted ligand of the TGF-beta (transforming growth factor-beta) superfamily of proteins. Ligands of this family bind various TGF-beta receptors leading to recruitment and activation of SMAD family transcription factors that regulate gene expression. The encoded preproprotein is proteolytically processed to generate each subunit of the disulfide-linked homodimer, which plays a role in bone and cartilage development. Duplication of a regulatory region downstream of this gene causes a form of brachydactyly characterized by a malformed index finger and second toe in human patients. [provided by RefSeq, Jul 2016] | CC_TC |
| 7431 | vimentin | VIM | This gene encodes a type III intermediate filament protein. Intermediate filaments, along with microtubules and actin microfilaments, make up the cytoskeleton. The encoded protein is responsible for maintaining cell shape and integrity of the cytoplasm, and stabilizing cytoskeletal interactions. This protein is involved in neuritogenesis and cholesterol transport and functions as an organizer of a number of other critical proteins involved in cell attachment, migration, and signaling. Bacterial and viral pathogens have been shown to attach to this protein on the host cell surface. Mutations in this gene are associated with congenital cataracts in human patients. [provided by RefSeq, Aug 2017] | CC_TC |
| 7528 | YY1 transcription factor | YY1 | YY1 is a ubiquitously distributed transcription factor belonging to the GLI-Kruppel class of zinc finger proteins. The protein is involved in repressing and activating a diverse number of promoters. YY1 may direct histone deacetylases and histone acetyltransferases to a promoter in order to activate or repress the promoter, thus implicating histone modification in the function of YY1. [provided by RefSeq, Jul 2008] | CC_TC |
| 84163 | GTF2I repeat domain containing 2 | GTF2IRD2 | This gene is one of several closely related genes on chromosome 7 encoding proteins containing helix-loop-helix motifs. These proteins may function as regulators of transcription. The encoded protein is unique in that its C-terminus is derived from CHARLIE8 transposable element sequence. This gene is located in a region of chromosome 7 that is deleted in Williams-Beuren syndrome, and loss of this locus may contribute to the cognitive phenotypes observed in this disease. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jul 2013] | CC_TC |
| 8492 | serine protease 12 | PRSS12 | This gene encodes a member of the trypsin family of serine proteases and contains a signal peptide, a proline-rich region, a Kringle domain, four scavenger receptor cysteine-rich domains, and a trypsin-like serine protease domain. The protein, sometimes referred to as neurotrypsin or motopsin, is secreted from neuronal cells and localizes to the synaptic cleft. Studies in mice show that this protein cleaves a protein, agrin, that is important for the formation and maintenance of exitatory synapses. Defects in this gene cause a form of autosomal recessive cognitive impairment (MRT1). [provided by RefSeq, Jul 2017] | CC_TC |
| 8526 | diacylglycerol kinase epsilon | DGKE | Diacylglycerol kinases are thought to be involved mainly in the regeneration of phosphatidylinositol (PI) from diacylglycerol in the PI-cycle during cell signal transduction. When expressed in mammalian cells, DGK-epsilon shows specificity for arachidonyl-containing diacylglycerol. DGK-epsilon is expressed predominantly in testis. [provided by RefSeq, Jul 2008] | CC_TC |
| 9403 | selenoprotein F | SELENOF | The protein encoded by this gene belongs to the SEP15/selenoprotein M family. The exact function of this protein is not known; however, it has been found to associate with UDP-glucose:glycoprotein glucosyltransferase (UGTR), an endoplasmic reticulum(ER)-resident protein, which is involved in the quality control of protein folding. The association with UGTR retains this protein in the ER, where it may play a role in protein folding. It has also been suggested to have a role in cancer etiology. This protein is a selenoprotein, containing the rare amino acid selenocysteine (Sec). Sec is encoded by the UGA codon, which normally signals translation termination. The 3’ UTRs of selenoprotein mRNAs contain a conserved stem-loop structure, designated the Sec insertion sequence (SECIS) element, that is necessary for the recognition of UGA as a Sec codon, rather than as a stop signal. Alternatively spliced transcript variants have been found for this gene. [provided by RefSeq, Nov 2016] | CC_TC |
| NA | NA | RASGEF1BB | NA | CC_TC |
[1] “The number of DMS in the CT_TT comparison is:” [1] 1257 [1] “The number of genes with DMS in the CT_TT comparison is:” [1] 547
| ENTREZID | description | GeneSymbol | Summary | comparison |
|---|---|---|---|---|
| 114794 | extracellular leucine rich repeat and fibronectin type III domain containing 2 | ELFN2 | CT_TT | |
| 140838 | N-acetylneuraminic acid phosphatase | NANP | CT_TT | |
| 145864 | hyaluronan and proteoglycan link protein 3 | HAPLN3 | This gene belongs to the hyaluronan and proteoglycan binding link protein gene family. The protein encoded by this gene may function in hyaluronic acid binding and cell adhesion. [provided by RefSeq, Jul 2008] | CT_TT |
| 150383 | cysteine rich DPF motif domain containing 1 | CDPF1 | CT_TT | |
| 158787 | RIB43A domain with coiled-coils 1 | RIBC1 | CT_TT | |
| 170591 | S100 calcium binding protein Z | S100Z | Members of the S100 protein family contain 2 calcium-binding EF-hands and exhibit cell-type specific expression patterns. For additional background information on S100 proteins, see MIM 114085.[supplied by OMIM, Mar 2008] | CT_TT |
| 221391 | opsin 5 | OPN5 | Opsins are members of the guanine nucleotide-binding protein (G protein)-coupled receptor superfamily. This opsin gene is expressed in the eye, brain, testes, and spinal cord. This gene belongs to the seven-exon subfamily of mammalian opsin genes that includes peropsin (RRH) and retinal G protein coupled receptor (RGR). Like these other seven-exon opsin genes, this family member may encode a protein with photoisomerase activity. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jun 2010] | CT_TT |
| 2287 | FKBP prolyl isomerase 3 | FKBP3 | The protein encoded by this gene is a member of the immunophilin protein family, which play a role in immunoregulation and basic cellular processes involving protein folding and trafficking. This encoded protein is a cis-trans prolyl isomerase that binds the immunosuppressants FK506 and rapamycin, as well as histone deacetylases, the transcription factor YY1, casein kinase II, and nucleolin. It has a higher affinity for rapamycin than for FK506 and thus may be an important target molecule for immunosuppression by rapamycin. [provided by RefSeq, Sep 2008] | CT_TT |
| 2535 | frizzled class receptor 2 | FZD2 | This intronless gene is a member of the frizzled gene family. Members of this family encode seven-transmembrane domain proteins that are receptors for the wingless type MMTV integration site family of signaling proteins. This gene encodes a protein that is coupled to the beta-catenin canonical signaling pathway. Competition between the wingless-type MMTV integration site family, member 3A and wingless-type MMTV integration site family, member 5A gene products for binding of this protein is thought to regulate the beta-catenin-dependent and -independent pathways. [provided by RefSeq, Dec 2010] | CT_TT |
| 26164 | mitochondrial ribosome associated GTPase 2 | MTG2 | Small G proteins, such as GTPBP5, act as molecular switches that play crucial roles in the regulation of fundamental cellular processes such as protein synthesis, nuclear transport, membrane trafficking, and signal transduction (Hirano et al., 2006 [PubMed 17054726]).[supplied by OMIM, Mar 2008] | CT_TT |
| 26531 | olfactory receptor family 11 subfamily A member 1 | OR11A1 | Olfactory receptors interact with odorant molecules in the nose, to initiate a neuronal response that triggers the perception of a smell. The olfactory receptor proteins are members of a large family of G-protein-coupled receptors (GPCR) arising from single coding-exon genes. Olfactory receptors share a 7-transmembrane domain structure with many neurotransmitter and hormone receptors and are responsible for the recognition and G protein-mediated transduction of odorant signals. The olfactory receptor gene family is the largest in the genome. The nomenclature assigned to the olfactory receptor genes and proteins for this organism is independent of other organisms. [provided by RefSeq, Jul 2008] | CT_TT |
| 29094 | galectin like | LGALSL | CT_TT | |
| 3040 | hemoglobin subunit alpha 2 | HBA2 | The human alpha globin gene cluster located on chromosome 16 spans about 30 kb and includes seven loci: 5’- zeta - pseudozeta - mu - pseudoalpha-1 - alpha-2 - alpha-1 - theta - 3’. The alpha-2 (HBA2) and alpha-1 (HBA1) coding sequences are identical. These genes differ slightly over the 5’ untranslated regions and the introns, but they differ significantly over the 3’ untranslated regions. Two alpha chains plus two beta chains constitute HbA, which in normal adult life comprises about 97% of the total hemoglobin; alpha chains combine with delta chains to constitute HbA-2, which with HbF (fetal hemoglobin) makes up the remaining 3% of adult hemoglobin. Alpha thalassemias result from deletions of each of the alpha genes as well as deletions of both HBA2 and HBA1; some nondeletion alpha thalassemias have also been reported. [provided by RefSeq, Jul 2008] | CT_TT |
| 4056 | leukotriene C4 synthase | LTC4S | The MAPEG (Membrane Associated Proteins in Eicosanoid and Glutathione metabolism) family includes a number of human proteins, several of which are involved the production of leukotrienes. This gene encodes an enzyme that catalyzes the first step in the biosynthesis of cysteinyl leukotrienes, potent biological compounds derived from arachidonic acid. Leukotrienes have been implicated as mediators of anaphylaxis and inflammatory conditions such as human bronchial asthma. This protein localizes to the nuclear envelope and adjacent endoplasmic reticulum. [provided by RefSeq, Jul 2008] | CT_TT |
| 4160 | melanocortin 4 receptor | MC4R | The protein encoded by this gene is a membrane-bound receptor and member of the melanocortin receptor family. The encoded protein interacts with adrenocorticotropic and MSH hormones and is mediated by G proteins. This is an intronless gene. Defects in this gene are a cause of autosomal dominant obesity. [provided by RefSeq, Jan 2010] | CT_TT |
| 440738 | microtubule associated protein 1 light chain 3 gamma | MAP1LC3C | Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] | CT_TT |
| 4736 | ribosomal protein L10a | RPL10A | Ribosomes, the organelles that catalyze protein synthesis, consist of a small 40S subunit and a large 60S subunit. Together these subunits are composed of 4 RNA species and approximately 80 structurally distinct proteins. This gene encodes a ribosomal protein that is a component of the 60S subunit. The protein belongs to the L1P family of ribosomal proteins. It is located in the cytoplasm. The expression of this gene is downregulated in the thymus by cyclosporin-A (CsA), an immunosuppressive drug. Studies in mice have shown that the expression of the ribosomal protein L10a gene is downregulated in neural precursor cells during development. This gene previously was referred to as NEDD6 (neural precursor cell expressed, developmentally downregulated 6), but it has been renamed RPL10A (ribosomal protein 10a). As is typical for genes encoding ribosomal proteins, there are multiple processed pseudogenes of this gene dispersed through the genome. [provided by RefSeq, Jul 2008] | CT_TT |
| 51647 | cytosolic iron-sulfur assembly component 2B | CIAO2B | CT_TT | |
| 5251 | phosphate regulating endopeptidase homolog X-linked | PHEX | The protein encoded by this gene is a transmembrane endopeptidase that belongs to the type II integral membrane zinc-dependent endopeptidase family. The protein is thought to be involved in bone and dentin mineralization and renal phosphate reabsorption. Mutations in this gene cause X-linked hypophosphatemic rickets. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Sep 2013] | CT_TT |
| 55584 | cholinergic receptor nicotinic alpha 9 subunit | CHRNA9 | This gene is a member of the ligand-gated ionic channel family and nicotinic acetylcholine receptor gene superfamily. It encodes a plasma membrane protein that forms homo- or hetero-oligomeric divalent cation channels. This protein is involved in cochlea hair cell development and is also expressed in the outer hair cells (OHCs) of the adult cochlea. [provided by RefSeq, Feb 2012] | CT_TT |
| 5603 | mitogen-activated protein kinase 13 | MAPK13 | This gene encodes a member of the mitogen-activated protein (MAP) kinase family. MAP kinases act as an integration point for multiple biochemical signals, and are involved in a wide variety of cellular processes such as proliferation, differentiation, transcription regulation and development. The encoded protein is a p38 MAP kinase and is activated by proinflammatory cytokines and cellular stress. Substrates of the encoded protein include the transcription factor ATF2 and the microtubule dynamics regulator stathmin. Alternatively spliced transcript variants have been observed for this gene. [provided by RefSeq, Jul 2012] | CT_TT |
| 83641 | family with sequence similarity 107 member B | FAM107B | CT_TT | |
| 85460 | zinc finger protein 518B | ZNF518B | CT_TT | |
| 85462 | FH2 domain containing 1 | FHDC1 | CT_TT | |
| 9775 | eukaryotic translation initiation factor 4A3 | EIF4A3 | This gene encodes a member of the DEAD box protein family. DEAD box proteins, characterized by the conserved motif Asp-Glu-Ala-Asp (DEAD), are putative RNA helicases. They are implicated in a number of cellular processes involving alteration of RNA secondary structure, such as translation initiation, nuclear and mitochondrial splicing, and ribosome and spliceosome assembly. Based on their distribution patterns, some members of this family are believed to be involved in embryogenesis, spermatogenesis, and cellular growth and division. The protein encoded by this gene is a nuclear matrix protein. Its amino acid sequence is highly similar to the amino acid sequences of the translation initiation factors eIF4AI and eIF4AII, two other members of the DEAD box protein family. [provided by RefSeq, Jul 2008] | CT_TT |
| NA | NA | MNCB-2990 | NA | CT_TT |
| NA | NA | ACBD5A | NA | CT_TT |
| NA | NA | HOXB9A | NA | CT_TT |
[1] “The number of DMS in the CC_CT comparison is:” [1] 329 [1] “The number of genes with DMS in the CC_CT comparison is:” [1] 144
| ENTREZID | description | GeneSymbol | Summary | comparison |
|---|---|---|---|---|
| 10626 | tripartite motif containing 16 | TRIM16 | The protein encoded by this gene is a tripartite motif (TRIM) family member that contains two B box domains and a coiled-coiled region that are characteristic of the B box zinc finger protein family. While it lacks a RING domain found in other TRIM proteins, the encoded protein can homodimerize or heterodimerize with other TRIM proteins and has E3 ubiquitin ligase activity. This gene is also a tumor suppressor and is involved in secretory autophagy. [provided by RefSeq, Jan 2017] | CC_CT |
| 1113 | chromogranin A | CHGA | The protein encoded by this gene is a member of the chromogranin/secretogranin family of neuroendocrine secretory proteins. It is found in secretory vesicles of neurons and endocrine cells. This gene product is a precursor to three biologically active peptides; vasostatin, pancreastatin, and parastatin. These peptides act as autocrine or paracrine negative modulators of the neuroendocrine system. Two other peptides, catestatin and chromofungin, have antimicrobial activity and antifungal activity, respectively. Two transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Sep 2014] | CC_CT |
| 150383 | cysteine rich DPF motif domain containing 1 | CDPF1 | CC_CT | |
| 221078 | NOP2/Sun RNA methyltransferase 6 | NSUN6 | CC_CT | |
| 26531 | olfactory receptor family 11 subfamily A member 1 | OR11A1 | Olfactory receptors interact with odorant molecules in the nose, to initiate a neuronal response that triggers the perception of a smell. The olfactory receptor proteins are members of a large family of G-protein-coupled receptors (GPCR) arising from single coding-exon genes. Olfactory receptors share a 7-transmembrane domain structure with many neurotransmitter and hormone receptors and are responsible for the recognition and G protein-mediated transduction of odorant signals. The olfactory receptor gene family is the largest in the genome. The nomenclature assigned to the olfactory receptor genes and proteins for this organism is independent of other organisms. [provided by RefSeq, Jul 2008] | CC_CT |
| 3040 | hemoglobin subunit alpha 2 | HBA2 | The human alpha globin gene cluster located on chromosome 16 spans about 30 kb and includes seven loci: 5’- zeta - pseudozeta - mu - pseudoalpha-1 - alpha-2 - alpha-1 - theta - 3’. The alpha-2 (HBA2) and alpha-1 (HBA1) coding sequences are identical. These genes differ slightly over the 5’ untranslated regions and the introns, but they differ significantly over the 3’ untranslated regions. Two alpha chains plus two beta chains constitute HbA, which in normal adult life comprises about 97% of the total hemoglobin; alpha chains combine with delta chains to constitute HbA-2, which with HbF (fetal hemoglobin) makes up the remaining 3% of adult hemoglobin. Alpha thalassemias result from deletions of each of the alpha genes as well as deletions of both HBA2 and HBA1; some nondeletion alpha thalassemias have also been reported. [provided by RefSeq, Jul 2008] | CC_CT |
| 440738 | microtubule associated protein 1 light chain 3 gamma | MAP1LC3C | Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] | CC_CT |
| 51647 | cytosolic iron-sulfur assembly component 2B | CIAO2B | CC_CT | |
| 55040 | epsin 3 | EPN3 | CC_CT | |
| 84513 | phospholipid phosphatase 5 | PLPP5 | CC_CT |
[1] “The number of DMS in the TC_TT comparison is:” [1] 341 [1] “The number of genes with DMS in the TC_TT comparison is:” [1] 171
| ENTREZID | description | GeneSymbol | Summary | comparison |
|---|---|---|---|---|
| 26001 | ring finger protein 167 | RNF167 | RNF167 is an E3 ubiquitin ligase that interacts with TSSC5 (SLC22A18; MIM 602631) and, together with UBCH6 (UBE2E1; MIM 602916), facilitates TSSC5 polyubiquitylation (Yamada and Gorbsky, 2006 [PubMed 16314844]).[supplied by OMIM, Mar 2008] | TC_TT |
| 3982 | lens intrinsic membrane protein 2 | LIM2 | This gene encodes an eye lens-specific protein found at the junctions of lens fiber cells, where it may contribute to cell junctional organization. It acts as a receptor for calmodulin, and may play an important role in both lens development and cataractogenesis. Mutations in this gene have been associated with cataract formation. Alternatively spliced transcript variants encoding different isoforms have been found for this gene.[provided by RefSeq, Sep 2009] | TC_TT |
| 4247 | alpha-1,6-mannosyl-glycoprotein 2-beta-N-acetylglucosaminyltransferase | MGAT2 | The product of this gene is a Golgi enzyme catalyzing an essential step in the conversion of oligomannose to complex N-glycans. The enzyme has the typical glycosyltransferase domains: a short N-terminal cytoplasmic domain, a hydrophobic non-cleavable signal-anchor domain, and a C-terminal catalytic domain. Mutations in this gene may lead to carbohydrate-deficient glycoprotein syndrome, type II. The coding region of this gene is intronless. Transcript variants with a spliced 5’ UTR may exist, but their biological validity has not been determined. [provided by RefSeq, Jul 2008] | TC_TT |
| 440738 | microtubule associated protein 1 light chain 3 gamma | MAP1LC3C | Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] | TC_TT |
| 51058 | zinc finger protein 691 | ZNF691 | TC_TT | |
| 6158 | ribosomal protein L28 | RPL28 | Ribosomes, the organelles that catalyze protein synthesis, consist of a small 40S subunit and a large 60S subunit. Together these subunits are composed of 4 RNA species and approximately 80 structurally distinct proteins. This gene encodes a ribosomal protein that is a component of the 60S subunit. The protein belongs to the L28E family of ribosomal proteins. It is located in the cytoplasm. Variable expression of this gene in colorectal cancers compared to adjacent normal tissues has been observed, although no correlation between the level of expression and the severity of the disease has been found. As is typical for genes encoding ribosomal proteins, there are multiple processed pseudogenes of this gene dispersed through the genome. Alternative splicing results in multiple transcript variants encoding distinct isoforms.[provided by RefSeq, Oct 2008] | TC_TT |
| 64866 | CUB domain containing protein 1 | CDCP1 | This gene encodes a transmembrane protein which contains three extracellular CUB domains and acts as a substrate for Src family kinases. The protein plays a role in the tyrosine phosphorylation-dependent regulation of cellular events that are involved in tumor invasion and metastasis. Alternative splicing results in multiple transcript variants of this gene. [provided by RefSeq, May 2013] | TC_TT |
| 85460 | zinc finger protein 518B | ZNF518B | TC_TT |
##############################################################
#### I. Focus on CpG positions at parental (Ctrl-Inf) DMS ####
##############################################################
####################################################################################
#### Question: how are the beta values in the different groups at the parental DMS?##
####################################################################################
##############
## Prepare dataset
##############
PM_G1 <- getPMdataset(uniteCov = uniteCov6_G1_woSexAndUnknowChrOVERLAP, MD = fullMetadata_PAR, gener="parents")
PM_G2 <- getPMdataset(uniteCov = uniteCov14_G2_woSexAndUnknowChrOVERLAP, MD = fullMetadata_OFFS, gener="offspring")
head(PM_G1)
head(PM_G2)
table(fullMetadata_OFFS$trtG1G2, fullMetadata_OFFS$clutch.ID)
## What is the relative contribution of methylation to brother pair & paternal treatment?
## Test of VCA: variance component analysis https://cran.r-project.org/web/packages/VCA/vignettes/VCA_package_vignette.html
## Hypo
PM_G2_mean_hypo <- PM_G2[PM_G2$hypohyper %in% "hypo", ] %>%
group_by(brotherPairID, G1_trt, G2_trt, ID) %>%
dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()
varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hypo,
MeanLine=list(var=c("G1_trt", "G2_trt"),
col=c("white", "blue"), lwd=c(2,2)),
BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypomethylated upon infection"))
myfitVCA_hypo <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hypo)
### Real values
trtEffect <- sum(myfitVCA_hypo$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA_hypo$aov.tab[5:8, 5])
error <- sum(myfitVCA_hypo$aov.tab[9, 5])
realValHypoVCA <- data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error)
### Randomisation
myrandomVCA <- function(df=PM_G2_mean_hypo){
randomDF = df
randomDF$G1_trt = sample(PM_G2_mean_hypo$G1_trt, replace = F)
randomDF$G2_trt = sample(PM_G2_mean_hypo$G2_trt, replace = F)
randomDF$brotherPairID = sample(PM_G2_mean_hypo$brotherPairID, replace = F)
myfitVCA <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = randomDF)
trtEffect <- sum(myfitVCA$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA$aov.tab[5:8, 5])
error <- sum(myfitVCA$aov.tab[9, 5])
return(data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error))
}
randomHypoVCA = do.call(rbind, lapply(1:1000, function(x) {
df=myrandomVCA(PM_G2_mean_hypo)
df$rep=x
return(df)}))
randomHypoVCA = melt(randomHypoVCA, id.vars = "rep")
# saveRDS(randomHypoVCA, file = "Rdata/randomHypoVCA.RDS")
randomHypoVCA <- readRDS(file = "Rdata/randomHypoVCA.RDS")
df2=reshape2::melt(realValHypoVCA)
sumDF <- randomHypoVCA %>%
group_by(variable) %>%
dplyr::summarize(value = mean(value)) %>% data.frame()
ggplot(randomHypoVCA, aes(x=variable, y=value))+
geom_boxplot()+
geom_jitter(width=.1, alpha=.2)+
geom_point(data = df2, col = "red", size = 6)+
geom_text(data=sumDF, aes(label=round(value)), col="white")+
geom_text(data = df2, aes(label=round(value)), col="white")+
theme_cleveland()+
ggtitle("VCA with bootstrap N=1000 at hypo-parDMS", subtitle = "red: observed values")
# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hypo, VarVC=TRUE)
## Hyper
PM_G2_mean_hyper <- PM_G2[PM_G2$hypohyper %in% "hyper", ] %>%
group_by(brotherPairID, G1_trt, G2_trt, ID) %>%
dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()
varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper,
MeanLine=list(var=c("G1_trt", "G2_trt"),
col=c("white", "blue"), lwd=c(2,2)),
BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypermethylated upon infection"))
myfitVCA_hyper <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper)
### Real values
trtEffect <- sum(myfitVCA_hyper$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA_hyper$aov.tab[5:8, 5])
error <- sum(myfitVCA_hyper$aov.tab[9, 5])
realValHyperVCA <- data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error)
### Randomisation
randomHyperVCA = do.call(rbind, lapply(1:1000, function(x) {
df=myrandomVCA(PM_G2_mean_hyper)
df$rep=x
return(df)}))
randomHyperVCA = melt(randomHyperVCA, id.vars = "rep")
# saveRDS(randomHyperVCA, file = "Rdata/randomHyperVCA.RDS")
randomHyperVCA <- readRDS(file = "Rdata/randomHyperVCA.RDS")
df2=reshape2::melt(realValHyperVCA)
sumDF <- randomHyperVCA %>%
group_by(variable) %>%
dplyr::summarize(value = mean(value)) %>% data.frame()
ggplot(randomHyperVCA, aes(x=variable, y=value))+
geom_boxplot()+
geom_jitter(width=.1, alpha=.2)+
geom_point(data = df2, col = "red", size = 6)+
geom_text(data=sumDF, aes(label=round(value)), col="white")+
geom_text(data = df2, aes(label=round(value)), col="white")+
theme_cleveland()+
ggtitle("VCA with bootstrap N=1000 at hyper-parDMS", subtitle = "red: observed values")
# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hypo, VarVC=TRUE)
################
## Hyper
PM_G2_mean_hyper <- PM_G2[PM_G2$hypohyper %in% "hyper", ] %>%
group_by(brotherPairID, G1_trt, G2_trt, ID) %>%
dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()
varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper,
MeanLine=list(var=c("G1_trt", "G2_trt"),
col=c("white", "blue"), lwd=c(2,2)),
BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypermethylated upon infection"))
myfitVCA_hyper <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper)
print(myfitVCA_hyper, digits=4)
# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hyper, VarVC=TRUE)
##############
## In parents
##############
parmod <- lmer(data = PM_G1, BetaValue ~ meth.diff.parentals : Treatment + (1|CpGSite) + (1|brotherPairID))
## check normality of residuals assumption
qqnorm(resid(parmod))
qqline(resid(parmod))
pred <- ggpredict(parmod, terms = c("meth.diff.parentals", "Treatment"))
plot(pred, add.data = T)+
scale_color_manual(values = c("black", "red"))+
scale_y_continuous(name = "Beta values")+
scale_x_continuous(name = "Methylation difference between infected and control parents in percentage")+
ggtitle("Predicted methylation ratio (Beta) values in parents\n as a function of differential methylation between exposed and control groups")+
theme_bw()
##############
## Linear model: does the beta value of offspring at DMS depends on treatment Parent x Offspring?
##############
modFull <- lmer(BetaValue ~ (G1_trt * G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID),data = PM_G2, REML = F) # REML =F for model comparison
mod_noG1trt <- lmer(BetaValue ~ G2_trt:hypohyper + (1|CpGSite)+ (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noG2trt <-lmer(BetaValue ~ G1_trt:hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noInteractions <- lmer(BetaValue ~ (G1_trt + G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noHypoHyper <- lmer(BetaValue ~ (G1_trt * G2_trt) + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
## check normality of residuals assumption
qqnorm(resid(modFull))
qqline(resid(modFull))
## Likelihood ratio tests for all variables:
lrtest(modFull, mod_noG1trt) # G1 trt is VERY VERY significant (LRT: χ² (4) = 1163.6, p < 0.001)
lrtest(modFull, mod_noG2trt) # G2 trt is VERY VERY significant (LRT: χ² (4) = 30.02, p < 0.001) NB that changed when brotherpair is used instead of family!
lrtest(modFull, mod_noInteractions) # interactions are significant (LRT: χ² (2) = 9.21, p < 0.01)
lrtest(modFull, mod_noHypoHyper) # hypo/hyper VERY VERY significant (LRT: χ² (4) = 1140, p < 0.001)
## Post-hoc tests between treatments
modFull <- lmer(BetaValue ~ (G1_trt * G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID),data = PM_G2)
modFull_emmeans <- emmeans(modFull, list(pairwise ~ (G1_trt:G2_trt):hypohyper), adjust = "tukey")
modFull_emmeans
P1 <- plot(modFull_emmeans, by = "hypohyper", comparisons = TRUE) +
# coord_flip()+
theme_bw() +
ggtitle("Estimated marginal means of methylation ratio (beta)\n of offspring at parental DMS")+
theme(legend.position = "none", axis.title.x = element_blank()) +
scale_x_continuous("Beta value (methylation ratio)", limits = c(47,69.5))
## NB: Comparison arrows: https://cran.r-project.org/web/packages/emmeans/vignettes/xplanations.html
## two estimated marginal means (EMMs) differ significantly if, and only if, their respective comparison arrows do not overlap
## These comparison arrows are decidedly not the same as confidence intervals.
## Confidence intervals for EMMs are based on the statistical properties of the individual EMMs, whereas comparison arrows
## are based on the statistical properties of differences of EMMs.
## Add the PARENTAL DMS value
## Same test on ALL, G1 and G2 fish
modFullG1 <- lmer(BetaValue ~ G1_trt:hypohyper + (1|CpGSite) + (1|brotherPairID), data = PM_G1)
modFullG1_emmeans <- emmeans(modFullG1, list(pairwise ~ G1_trt:hypohyper), adjust = "tukey")
modFullG1_emmeans
P2 <- plot(modFullG1_emmeans, by = "hypohyper", comparisons = TRUE) +
theme_bw() +
ggtitle("Estimated marginal means of methylation ratio (beta)\n of parents at DMS")+
theme(legend.position = "none", axis.title.x = element_blank()) +
scale_x_continuous("Beta value (methylation ratio)", limits = c(47,69.5))
ggarrange(P2, P1, labels = c("A", "B"), ncol = 1, nrow = 2)
#####################################################################################
## Are the nbr of residuals methylation AT PARENTAL DMS different in the 4 G2 trt? ##
## (for hypo vs hypermeth)? ##
##############################
length(unique(PM_G1$CpGSite))# 3648 positions
PM_G1 %>% dplyr::count(ID)## NB: not all covered in all samples
length(unique(PM_G2$CpGSite[PM_G2$hypohyper %in% "hypo"]))# 1176 positions hypomethylated upon parental inf
length(unique(PM_G2$CpGSite[PM_G2$hypohyper %in% "hyper"]))# 2472 positions hypermethylated upon parental inf
myfun <- function(X){
## Calculate nbr of CpG hypo/hypermethylated per individual, and nbr of covered CpG:
X <- X %>% group_by(ID, Treatment, brotherPairID, clutch.ID, Sex) %>%
dplyr::summarise(ncov = n(),
hypoMeth = sum(BetaValue < 0.3),
hyperMeth = sum(BetaValue > 0.7)) %>% data.frame()
# Calculate residuals of nbr of methCpG by nbr of covered CpG
X$res_Nbr_methCpG_Nbr_coveredCpG_HYPO = residuals(lm(X$hypoMeth ~ X$ncov))
X$res_Nbr_methCpG_Nbr_coveredCpG_HYPER = residuals(lm(X$hyperMeth ~ X$ncov))
## Statistical model: is it different in each offspring trt group?
mod1 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X, REML = F)
mod0 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ 1 + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X, REML = F)
print(lrtest(mod1, mod0))
## Post-hoc test:
modhypo <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X)
## pairwise posthoc test
print(emmeans(modhypo, list(pairwise ~ Treatment), adjust = "tukey"))
mod3 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X, REML = F)
mod4 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ 1 + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X, REML = F)
print(lrtest(mod3, mod4))
## Post-hoc test:
modhyper <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X)
## pairwise posthoc test
print(emmeans(modhyper, list(pairwise ~ Treatment), adjust = "tukey"))
## Plot
phypo <- plot(ggpredict(modhypo, terms = c("Treatment")), add.data = TRUE)+
scale_y_continuous("Residuals of number of hypomethylated methylated \ncytosines on number of cytosines covered") +
ggtitle("Predicted residuals nbr of hypomethylated CpG")+
theme_bw()
phyper <- plot(ggpredict(modhyper, terms = c("Treatment")), add.data = TRUE)+
scale_y_continuous("Residuals of number of hypermethylated methylated \n cytosines on number of cytosines covered") +
ggtitle("Predicted residuals nbr of hypermethylated CpG")+
theme_bw()
return(list(phypo, phyper))
}
listplots <- myfun(X = PM_G2[PM_G2$hypohyper %in% "hypo",])
## NOT significant
annotate_figure(ggarrange(listplots[[1]], listplots[[2]],ncol = 2, nrow = 1),
top = text_grob("Parental DMS hypomethylated upon infection, in offspring"))
listplots <- myfun(X = PM_G2[PM_G2$hypohyper %in% "hyper",])
## Treatment SIGNIFICANT in both excess hypo/hyper methylation **
# $`pairwise differences of Treatment`
## HYPO
# 1 estimate SE df t.ratio p.value
# NE_control - E_control 23.71 7.28 10.3 3.257 0.0353
# NE_control - E_exposed 26.88 7.26 10.3 3.701 0.0172
## HYPER
# 1 estimate SE df t.ratio p.value
# NE_control - E_control -24.06 7.36 10.3 -3.269 0.0348
# NE_control - E_exposed -27.07 7.34 10.3 -3.687 0.0177
annotate_figure(ggarrange(listplots[[1]], listplots[[2]],ncol = 2, nrow = 1),
top = text_grob("Parental DMS hypermethylated upon infection, in offspring"))
## The beta values in parentalDMS in offspring follow the parental pattern hypo/hyper methylated upon infection
######################################################################################
## II. CORE DMS: Which of the parental DMS are also found in offspring comparisons? ##
######################################################################################
intersect(paste(DMSlist$DMS_15pc_G1_C_T$chr, DMSlist$DMS_15pc_G1_C_T$start),
intersect(paste(DMSlist$DMS_15pc_CC_CT$chr, DMSlist$DMS_15pc_CC_CT$start),
paste(DMSlist$DMS_15pc_TC_TT$chr, DMSlist$DMS_15pc_TC_TT$start)))
## ONLY 4!!! "Gy_chrII 22196179" "Gy_chrXII 10863858" "Gy_chrXVII 2658079" "Gy_chrXX 5344222"
###############################################################
## CORE DMS: Which of the DMS are common to CC-CI and IC-II? ##
###############################################################
makeManP <- function(comp1, comp2){
A <- methylKit::select(DMS1, which(paste(DMS1$chr, DMS1$start) %in% coreDMS))
B <- as.data.frame(annotateWithGeneParts(as(A,"GRanges"),annotBed12)@members)
A2 <- methylKit::select(DMS2,
which(paste(DMS2$chr, DMS2$start) %in% coreDMS))
B2 <- as.data.frame(annotateWithGeneParts(as(A2,"GRanges"),annotBed12)@members)
ggarrange(
makeManhattanPlots(DMSfile = DMS1,
annotFile = as.data.frame(annotateWithGeneParts(as(DMS1,"GRanges"),annotBed12)@members),
GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"),
mytitle = paste0("Manhattan plot of ", comp1, " DMS")),
makeManhattanPlots(DMSfile = DMS2,
annotFile = as.data.frame(annotateWithGeneParts(as(DMS2,"GRanges"),annotBed12)@members),
GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"),
mytitle = paste0("Manhattan plot of ", comp2, " DMS")),
makeManhattanPlots(DMSfile = A, annotFile = B, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot core DMS in ", comp1)),
makeManhattanPlots(DMSfile = A2, annotFile = B2, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot core DMS in ", comp2)),
labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4, common.legend = T)
}
DMS1 = DMSlist$DMS_15pc_CC_CT # from: getDiffMeth(myuniteCov = uniteCov14_G2_woSexAndUnknowChr_G1CONTROL, myMetadata = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% c(5,6),])
DMS2 = DMSlist$DMS_15pc_TC_TT # from: getDiffMeth(myuniteCov = uniteCov14_G2_woSexAndUnknowChr_G1INFECTED, myMetadata = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% c(2,3),])
coreDMS <- intersect(paste(DMS1$chr, DMS1$start), paste(DMS2$chr, DMS2$start))
## Make Manhattan plot:
makeManP(comp1 = "CC-CI", comp2 = "IC-II")
## Annotation of the core DMS:
coreDMSmethylDiff <- methylKit::select(DMS1, which(paste(DMS1$chr, DMS1$start) %in% coreDMS))
## Differentially methylated sites:
subGOterms = getAnnotationFun(METHOBJ = coreDMSmethylDiff)
## Background annotations:
universeGOterms = getAnnotationFun(METHOBJ = uniteCov14_G2_woSexAndUnknowChrOVERLAP)
length(universeGOterms)# 16024
# as.vector(lapply(strsplit(as.character(TSSAssociation_DiffMeth15p$feature.name), "\\."), "[", 1))
# Using the genes with associated pop-DMS, we applied
# a conditional hypergeometric Gene Ontology (GO) term enrichment
# analysis (false discovery rate–corrected P ≤ 0.05) with the Ensembl
# stickleback annotation dataset “gaculeatus_gene_ensembl,” and all
# genes that were associated to any sequenced CpG site were used as
# universe. We identified overrepresented biological processes, molec-
# ular functions, and cellular components using the packages GOstats
# version 2.5 (59) and GSEABase version 1.46 (60) and corrected for
# multiple testing using the false discovery rate method implemented
# in goEnrichment version 1.0 (61) in R version 3.6 (52). Figures were
# produced using ggplot2 version 3.2 (56)
#######################################################################
## TRANSMITTED DMS: Which of the DMS are found in CCvsIC and CIvsII? ##
#######################################################################
makeManP(DMS1 = DMS15pc_G1_controlG2_half, comp1 = "CC-IC",
DMS2 = DMS15pc_G1_infectedG2_half, comp2 = "CI-II")
##############
## Plot mean Beta values of offsprings at parental DMS, per trt, along the chromosomes
##############
meanBeta_G2_simple <- PM_G2 %>% group_by(Chr, Pos, Treatment) %>%
dplyr::summarize(Mean = mean(BetaValue, na.rm=TRUE))
names(meanBeta_G2_simple) <- c("Chr", "Pos", "Treatment_G2", "MeanBetaG2")
genome <- GYgynogff %>%
mutate(chrom_nr=chrom %>% deroman(), chrom_order=factor(chrom_nr) %>% as.numeric()) %>%
arrange(chrom_order) %>%
mutate(gstart=lag(length,default=0) %>% cumsum(), gend=gstart+length,
type=LETTERS[2-(chrom_order%%2)], gmid=(gstart+gend)/2)
mydata = tibble(trt=meanBeta_G2_simple$Treatment_G2,
chrom=meanBeta_G2_simple$Chr, pos=meanBeta_G2_simple$Pos, beta=meanBeta_G2_simple$MeanBetaG2)
mydata$pos <- as.numeric(mydata$pos)
mydata$pos <- as.numeric(mydata$pos)
table(mydata$chrom)## check that chrXIX and chrUN are well removed!!
# join DMS and genomic position
data = dplyr::left_join(mydata, genome) %>% dplyr::mutate(gpos=pos+gstart)
# plot:
ggplot()+
geom_rect(data=genome,aes(xmin=gstart,xmax=gend,ymin=-Inf,ymax=Inf,fill=type), alpha=.2)+
geom_point(data=data, aes(x=gpos,y=beta, shape=trt, col=trt),fill="white", size = 1)+
scale_color_manual(values = colOffs)+
scale_shape_manual(values=c(21,21,21,21))+
scale_fill_manual(values=c(A=rgb(.9,.9,.9),B=NA),guide="none")+
scale_x_continuous(breaks=genome$gmid,labels=genome$chrom %>% str_remove(.,"Gy_chr"),
position = "top",expand = c(0,0))+
theme_minimal()+
theme(panel.grid = element_blank(),
axis.line=element_blank(),
axis.title = element_blank(),
strip.placement = "outside")+
facet_grid(trt~.)+
ggtitle("Mean methylation proportions at the 3648 parental DMS for each offspring group")
###########################################################################################
## Not conclusive: test hyp that beta value is different (higher?) in the center of the chromosome,
## where there are less recombinations. Test for each chromosome
meanBeta_G2_extended <- PM_G2 %>% group_by(Chr, Pos, Treatment, Sex, brotherPairID) %>%
dplyr::summarize(Mean = mean(BetaValue, na.rm=TRUE))
names(meanBeta_G2_extended) <- c("Chr", "Pos", "Treatment_G2","Sex", "brotherPairID", "MeanBetaG2")
mydata = tibble(trt=meanBeta_G2_extended$Treatment_G2, sex = meanBeta_G2_extended$Sex, brotherPairID = meanBeta_G2_extended$brotherPairID,
chrom=meanBeta_G2_extended$Chr, pos=meanBeta_G2_extended$Pos, beta=meanBeta_G2_extended$MeanBetaG2)
mydata$pos <- as.numeric(mydata$pos)
# join DMS and genomic position
data = dplyr::left_join(mydata, genome) %>% dplyr::mutate(gpos=pos+gstart)
## Add distance to center
data$dist2mid <- abs(data$gmid - data$gpos)
mod <- lmer(beta ~ dist2mid:chrom + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
mod0 <- lmer(beta ~ dist2mid + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
mod00 <- lmer(beta ~ chrom + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
lrtest(mod, mod0) # chromosome matters
lrtest(mod0, mod00) # distance to middle matters
## check normality of residuals assumption
qqnorm(resid(mod))
qqline(resid(mod)) # quite skewed
pred <- ggpredict(mod, terms = c("dist2mid","chrom"))
plot(pred) +
scale_color_manual(values = 1:20)